
* The repository root is passed as the first positional argument of this do-file.
global root_dir = "`1'"

* Project-wide configuration. The log_dir global used below is not set in this
* file, so it is presumably defined by config.do (confirm).
include "$root_dir/code/config/config.do"

* Open a named log. The path is quoted (like the include above) so that a log
* directory containing spaces does not break the command.
cap noi log using "${log_dir}/patstat2018b.log", replace name(dat)

* Positional arguments 2-5 are optional. A caller that wants to skip one passes
* the placeholder ___EMPTY___, which is mapped to an empty string here.
forvalues k = 1/4 {
    local pos = `k' + 1
    global arg`k' = cond("``pos''" == "___EMPTY___", "", "``pos''")
}

* Every argument that was actually supplied overrides the matching global.
if "${arg1}" != "" {
    global weight_category "${arg1}"
    display "Weight category: ${weight_category}"
}

if "${arg2}" != "" {
    global weight_versions "${arg2}"
    display "Weight versions: ${weight_versions}"
}

if "${arg3}" != "" {
    global weight_window "${arg3}"
    display "Weight window: ${weight_window}"
}

if "${arg4}" != "" {
    global wtype "${arg4}"
}
* wtype is echoed unconditionally: when argument 5 was not supplied this prints
* whatever value the global already holds (possibly nothing).
display "${wtype}"

capture noi {

* Loads, cleans and imputes missing values for PATSTAT 2018b.
* Make sure you have the unzipped files in the folder ./DHOZ_replication_jpe/datasets/common_data/patstat_raw

* Directories used only by the PATSTAT build:
*   patstat_in  - raw, unzipped PATSTAT text files (input)
*   patstat_out - cleaned datasets produced by this script
*   patstat_tmp - intermediate .dta files, a sub-directory of patstat_out
global patstat_in "${root_dir}/datasets/common_data/patstat_raw"
global patstat_tmp "${root_dir}/datasets/common_data/patstat_2018b/tmp"
global patstat_out "${root_dir}/datasets/common_data/patstat_2018b"

* mkdir does not create missing parent directories, so the output directory
* has to exist before its tmp sub-directory can be created. Errors are
* captured because either directory may already exist from a previous run.
cap mkdir "${patstat_out}"
cap mkdir "${patstat_tmp}"

**************************************************************************************************************************
****			TLS201
**************************************************************************************************************************
* Convert each of the 10 raw TLS201 text parts (part01 ... part10) to .dta
forvalues i = 1/10 {
    local part = string(`i', "%02.0f")
    clear
    insheet using ${patstat_in}/tls201_part`part'.txt, name
    save ${patstat_tmp}/tls201_part`part'.dta, replace
}

* Stack the parts into one table, starting from part 01
use ${patstat_tmp}/tls201_part01.dta, clear
forvalues i = 2/10 {
    local part = string(`i', "%02.0f")
    append using ${patstat_tmp}/tls201_part`part', force
}

* Application filing date: string (YMD) -> Stata daily date under a new name
generate appln_date = date(appln_filing_date, "YMD")
drop appln_filing_date
format appln_date %td

* Earliest filing and earliest publication dates: same conversion, but the
* variables keep their original names
foreach datevar in earliest_filing_date earliest_publn_date {
    generate `datevar'2 = date(`datevar', "YMD")
    drop `datevar'
    rename `datevar'2 `datevar'
    format `datevar' %td
}

* Intellectual property type: string -> labelled numeric
encode ipr_type, gen(ipr_type2)
drop ipr_type
rename ipr_type2 ipr_type

compress
save ${patstat_tmp}/tls201.dta, replace


**************************************************************************************************************************
****			BASIC INFO & APPLN DATE
**************************************************************************************************************************
* Works on the TLS201 data still in memory from the previous section.
* Keep only the basic application-level variables ...
keep appln_id appln_nr appln_auth appln_kind appln_filing_year ipr_type ///
    appln_date internat_appln_id granted nb_inventors ///
    earliest_filing_year earliest_publn_year appln_nr_epodoc

* ... and use the shorter name for the filing year
rename appln_filing_year appln_year

* Application kind: strip surrounding blanks, then string -> labelled numeric
replace appln_kind = trim(appln_kind)
encode appln_kind, gen(appln_kind2)
drop appln_kind
rename appln_kind2 appln_kind

compress
save ${patstat_out}/appln_info.dta, replace


**************************************************************************************************************************
****			Family
**************************************************************************************************************************

* Start again from the full TLS201 table
use ${patstat_tmp}/tls201.dta, clear

rename appln_filing_year appln_year

drop appln_nr appln_kind appln_nr_original internat_appln_id ///
    int_phase reg_phase nat_phase granted nb_inventors

* Now look at patent families (docdb_family_id) rather than applications.
* First, compute the initial filing year/date of each family and distribute it
* to all applications of that family.
bysort docdb_family_id : egen int fam_earliest_appln_year = min(appln_year)
bysort docdb_family_id : egen fam_earliest_appln_date = min(appln_date)
generate byte minapplndate = (fam_earliest_appln_date == appln_date)

* Note: we originally generated some more variables here that were never used
* and were removed to reduce size. That process used a
* bysort docdb_family_id minapplndate; the sort is kept to preserve the order
* of the data as we used it.
sort docdb_family_id minapplndate

keep appln_id docdb_family_id docdb_family_size ///
    nb_citing_docdb_fam fam_earliest_appln_year

sort appln_id
compress
save ${patstat_out}/family_info.dta, replace

**************************************************************************************************************************
****			* CPC codes
**************************************************************************************************************************
* Loads CPC codes for later use.
* Convert the 8 raw TLS224 parts to .dta
forvalues p = 1/8 {
    clear
    insheet using ${patstat_in}/tls224_part0`p'.txt, name
    save ${patstat_tmp}/tls224_part0`p'.dta, replace
}

* Stack the parts
local stub ${patstat_tmp}/tls224_part0
use `stub'1.dta, clear
forvalues p = 2/8 {
    append using `stub'`p'.dta, force
}

* Keep only applications and their CPC codes: long format with unique
* application x CPC code combinations, blanks removed from the code
keep appln_id cpc_cla
rename cpc_cla cpc_code
replace cpc_code = subinstr(cpc_code, " ", "", .)
duplicates drop
compress
save ${patstat_out}/CPC_codes.dta, replace


**************************************************************************************************************************
****			* TLS207 applicants & inventors
**************************************************************************************************************************
* Convert the 3 raw TLS207 parts to .dta
forvalues p = 1/3 {
    clear
    insheet using ${patstat_in}/tls207_part0`p'.txt, name
    save ${patstat_tmp}/tls207_part0`p'.dta, replace
}

* Stack the parts and keep the combined table as an intermediate file
local stub ${patstat_tmp}/tls207_part0
use `stub'1.dta, clear
forvalues p = 2/3 {
    append using `stub'`p'.dta, force
}

compress
save ${patstat_tmp}/tls207.dta, replace

* Applicants: rows with a positive applicant sequence number
keep if applt_seq_nr > 0
keep appln_id person_id
compress
save ${patstat_out}/appln_applicants.dta, replace

* Inventors: rows with a positive inventor sequence number
use ${patstat_tmp}/tls207.dta, clear
keep if invt_seq_nr > 0
keep appln_id person_id
compress
save ${patstat_out}/appln_inventors.dta, replace


**************************************************************************************************************************
****			Country codes
**************************************************************************************************************************
* Save the country codes used by PATSTAT for later use (they correspond to
* ISO alpha-3 codes), together with some extra variables such as OECD
* membership, kept for completeness.
clear
insheet using ${patstat_in}/tls801_part01.txt, name

* Turn each "Y"-coded flag into a 0/1 byte indicator under the same name
foreach flag in state_indicator eu_member epo_member oecd_member discontinued {
    generate byte `flag'_ind = (`flag' == "Y")
    drop `flag'
    rename `flag'_ind `flag'
}

encode continent, gen(continent_code)
compress
save ${patstat_out}/country_codes.dta, replace


**************************************************************************************************************************
****			PUBLICATION INFO
**************************************************************************************************************************

* Convert the 5 raw TLS211 parts to .dta
forvalues p = 1/5 {
    clear
    insheet using ${patstat_in}/tls211_part0`p'.txt, name
    save ${patstat_tmp}/tls211_part0`p'.dta, replace
}

* Stack the parts and keep the combined table as an intermediate file
local stub ${patstat_tmp}/tls211_part0
use `stub'1, clear
forvalues p = 2/5 {
    append using `stub'`p'.dta, force
}
compress
save ${patstat_tmp}/tls211.dta, replace

* Publication date and year
drop publn_nr_original publn_lg
rename publn_date pat_publn_date
replace pat_publn_date = subinstr(pat_publn_date, "-", "/", .)
generate publn_date = date(pat_publn_date, "YMD")
generate publn_year = year(publn_date)
* Year 9999 is treated as "no publication date": blank both year and date
replace publn_year = . if publn_year == 9999
replace publn_date = . if publn_year == .
drop pat_publn_date
drop if appln_id == 0
format publn_date %td

* Strip surrounding blanks from the string identifiers
foreach strvar in publn_nr publn_auth publn_kind {
    replace `strvar' = trim(`strvar')
}
compress
save ${patstat_out}/publn_info.dta, replace


**************************************************************************************************************************
****			Priorities
**************************************************************************************************************************


* TLS204 comes as a single part; it is converted to .dta without any cleaning.
* Later sections of this script read prior_appln_id from this file.
clear
insheet using ${patstat_in}/tls204_part01.txt, name
save ${patstat_out}/priorities.dta, replace



**************************************************************************************************************************
****			* Persons
**************************************************************************************************************************

* Import information on the persons associated with an application.
* Each of the 5 raw TLS906 parts is saved twice: once in full and once as a
* "_nonames" copy that keeps only the id, country and sector columns
* (i.e. without person names).
forvalues p = 1/5 {
    clear
    insheet using ${patstat_in}/tls906_part0`p'.txt, name
    save ${patstat_tmp}/tls906_part0`p'.dta, replace
    keep person_id person_ctry_code psn_id psn_sector
    save ${patstat_tmp}/tls906_part0`p'_nonames.dta, replace
}

* Stack the name-free parts
local stub ${patstat_tmp}/tls906_part0
use `stub'1_nonames.dta, clear
forvalues p = 2/5 {
    append using `stub'`p'_nonames.dta, force
}
compress
save ${patstat_tmp}/tls906_nonames.dta, replace


* Generate more aggregated sectors
rename person_ctry_code country
rename psn_id hrm_id
rename psn_sector sector

* Every sector string not listed below stays in category 1 (COMPANY)
generate secteur = 1
replace secteur = 2 if sector == "INDIVIDUAL"
replace secteur = 3 if inlist(sector, "UNIVERSITY", "COMPANY UNIVERSITY", "GOV NON-PROFIT UNIVERSITY", "COMPANY GOV NON-PROFIT UNIVERSITY")
replace secteur = 4 if inlist(sector, "GOV NON-PROFIT", "COMPANY GOV NON-PROFIT")
replace secteur = 5 if inlist(sector, "HOSPITAL", "COMPANY HOSPITAL", "UNIVERSITY HOSPITAL", "GOV NON-PROFIT HOSPITAL")
replace secteur = 6 if inlist(sector, "UNKNOWN", "")
label define categ 1 "COMPANY" 2 "INDIVIDUAL" 3 "UNIVERSITY"  4 "GOV NON-PROFIT"  5 "HOSPITAL" 6 "UNKNOWN"
label values secteur categ

* The labelled numeric variable replaces the original string
drop sector
rename secteur sector
compress
save ${patstat_out}/persons_info.dta, replace



**************************************************************************************************************************
****			AUTHORITY & INVENTOR COUNTRY
**************************************************************************************************************************

* Crucially, we try to obtain a set of inventor countries that is as complete
* as possible.

* Step 1: inventor country at the application level
use ${patstat_out}/appln_inventors.dta, clear
mmerge person_id using ${patstat_out}/persons_info.dta, unmatched(none) ukeep(country)
drop person_id
rename country invt_country
replace invt_country = trim(invt_country)
drop if inlist(invt_country, "", "\N")
sort appln_id
duplicates drop
compress
save ${patstat_out}/appln_inventor_country.dta, replace


* Step 2: authority of the priority application(s) of each application
use ${patstat_out}/appln_info.dta, clear
keep appln_id
mmerge appln_id using ${patstat_out}/priorities.dta, unmatched(master)
mmerge prior_appln_id using ${patstat_out}/appln_info.dta, unmatched(master) ukeep(appln_auth) umatch(appln_id)
rename appln_auth prio_auth
replace prio_auth = trim(prio_auth)
drop if inlist(prio_auth, "", "\N")
sort appln_id
compress
save ${patstat_out}/appln_prio_auth.dta, replace

* Step 3: inventor countries at the family level instead of just the
* application level
use ${patstat_out}/family_info.dta, clear
keep appln_id docdb_family_id
mmerge appln_id using ${patstat_out}/appln_inventor_country.dta, unmatched(master)
replace invt_country = trim(invt_country)
drop if inlist(invt_country, "", "\N")
drop appln_id
duplicates drop
rename invt_country fam_invt_country
compress
save ${patstat_out}/docdbid_fam_invt_ctry.dta, replace

* Step 4: put everything together. If an application has a missing inventor
* country we try to get it from the family; should that fail we use the
* authority of the priority application; should that fail too we use the
* patenting authority of the application itself, which gives a complete set.
use ${patstat_out}/appln_info.dta, clear
keep appln_id appln_auth
mmerge appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id)
mmerge appln_id using ${patstat_out}/appln_inventor_country.dta, unmatched(master)
mmerge docdb_family_id using ${patstat_out}/docdbid_fam_invt_ctry.dta, unmatched(master)
mmerge appln_id using ${patstat_out}/appln_prio_auth.dta, unmatched(master)
drop docdb_family_id
duplicates drop
foreach fallback in fam_invt_country prio_auth appln_auth {
    replace invt_country = `fallback' if invt_country == ""
}
keep appln_id invt_country
duplicates drop
* Any code longer than two characters is collapsed to "U"
replace invt_country = "U" if length(invt_country) > 2
compress
save ${patstat_out}/appln_inventor_country_nomissing.dta, replace


* The family-level helper file is no longer needed
erase ${patstat_out}/docdbid_fam_invt_ctry.dta


**************************************************************************************************************************
****			APPLICANT COUNTRY
**************************************************************************************************************************

* Country of each applicant of an application.
* Careful: the applicant is not necessarily the same as the inventor.
use ${patstat_out}/appln_applicants.dta, clear
mmerge person_id using ${patstat_out}/persons_info.dta, unmatched(none) ukeep(country)
drop person_id
ren country applt_country
* The cleaning below must use the name created by the rename just above.
* The earlier spelling appl_country does not exist in the data and is not a
* valid abbreviation of applt_country, so the script aborted here.
* NOTE(review): the saved variable is named applt_country; confirm that
* downstream code expects this name.
replace applt_country = ltrim(rtrim(applt_country))
* Drop empty countries and the \N placeholder
drop if applt_country==""
drop if applt_country=="\N"
sort appln_id
duplicates drop
compress
save ${patstat_out}/appln_applicant_country.dta, replace


**************************************************************************************************************************
****			APPLN ID - IPC
**************************************************************************************************************************

* Once again load raw data: convert the 8 raw TLS209 parts to .dta
forvalues p = 1/8 {
    clear
    insheet using ${patstat_in}/tls209_part0`p'.txt, name
    save ${patstat_tmp}/tls209_part0`p'.dta, replace
}

* Stack the parts and keep the combined table as an intermediate file
local stub ${patstat_tmp}/tls209_part0
use `stub'1.dta, clear
forvalues p = 2/8 {
    append using `stub'`p'.dta, force
}
compress
save ${patstat_tmp}/tls209.dta, replace

* Collect the IPC codes associated with each application (blanks removed,
* one row per application x code)
keep appln_id ipc_class_symbol
rename ipc_class_symbol ipc_code
replace ipc_code = subinstr(ipc_code, " ", "", .)
duplicates drop
compress
save ${patstat_out}/ipc_codes.dta, replace


************************************************************************************************************************
****			APPLN ID - technical field through IPC-technical field correspondance
**************************************************************************************************************************

* IPC - technical field correspondence (TLS901)

clear
import delimited using ${patstat_in}/tls901_part01.txt, stringcols(2)  varnames(1)
save ${patstat_out}/ipc_techn_field.dta, replace
* The four symbols below are removed from the exported CSV only; the .dta saved
* just above still contains them. Original rationale: these codes technically
* are machinery and thus appear later on again, but we do not consider that
* they may have automation codes.
drop if inlist(ipc_maingroup_symbol, "F24T", "F24V", "F24S", "G16H")
export delimited using ${root_dir}/classification/patstat/ipc_techn_fields.csv, replace

* List of technical fields: the correspondence without the IPC symbol,
* de-duplicated and converted to numeric where possible
use ${patstat_out}/ipc_techn_field.dta, clear
drop ipc_maingroup_symbol
duplicates drop
destring *, replace
sort techn_field_nr
save ${patstat_out}/techn_fields.dta, replace


* Appln_id - technical field (TLS230): convert the 2 raw parts and stack them
forvalues p = 1/2 {
    clear
    insheet using ${patstat_in}/tls230_part0`p'.txt, name
    save ${patstat_tmp}/tls230_part0`p'.dta, replace
}


use ${patstat_tmp}/tls230_part01.dta, clear
append using ${patstat_tmp}/tls230_part02.dta, force
compress
save ${patstat_out}/appln_id_techn_field.dta, replace

**************************************************************************************************************************
****			FAMILY INFO
**************************************************************************************************************************

* Family size by docdb family, measured as the number of distinct patent
* offices (appln_auth) in the family
use ${patstat_out}/appln_info.dta, clear
keep appln_id appln_auth
mmerge appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id)

* One row per family x office, then count the rows of each family
keep docdb_family_id appln_auth
duplicates drop
generate x = 1
bysort docdb_family_id : egen famsize = sum(x)

keep docdb_family_id famsize
duplicates drop
compress
save ${patstat_out}/docdb_families_famsize.dta, replace



* Family size by docdb family, measured as the number of distinct
* office x application-number combinations in the family
use ${patstat_out}/appln_info.dta, clear
keep appln_id appln_auth appln_nr
mmerge appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id)

* One row per family x office x application number, then count per family
keep docdb_family_id appln_auth appln_nr
duplicates drop
generate x = 1
bysort docdb_family_id : egen famsize = sum(x)

keep docdb_family_id famsize
duplicates drop
compress
save ${patstat_out}/docdb_families_famsize_nb_pat.dta, replace


* Create family-level variables for docdb families
use ${patstat_out}/appln_info.dta, clear
keep appln_id appln_auth appln_year
mmerge appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id)

* Family year = earliest application year within the family
bysort docdb_family_id : egen fam_year = min(appln_year)
drop appln_id
duplicates drop

* Per family: number of remaining rows filed at each of the six offices
foreach office in EP JP US GB FR DE {
    generate appln_`office' = (appln_auth == "`office'")
    bysort docdb_family_id : egen fam_`office' = sum(appln_`office')
}

* Three definitions of "European coverage":
*   A - an EP filing
*   B - an EP filing, or filings in all of DE, FR and GB
*   C - an EP filing, or a filing in any of DE, FR, GB
local europe_A "(fam_EP>0)"
local europe_B "(fam_EP>0 | (fam_DE>0 & fam_FR>0 & fam_GB>0))"
local europe_C "(fam_EP>0 | fam_DE>0 | fam_FR>0 | fam_GB>0)"

* Europe or US
foreach v in A B C {
    generate EP_or_US_`v' = `europe_`v'' | fam_US>0
}
* Europe and US
foreach v in A B C {
    generate biadic_`v' = `europe_`v'' & fam_US>0
}
* Europe, Japan and US
foreach v in A B C {
    generate triadic_`v' = `europe_`v'' & fam_JP>0 & fam_US>0
}

keep docdb_family_id fam_EP fam_US fam_JP EP_or_US* biadic* triadic* fam_year

* Cap the office counts at 1 so that they become presence indicators
foreach office in EP US JP {
    replace fam_`office' = 1 if fam_`office' > 1
}
duplicates drop

* Attach both family-size measures
mmerge docdb_family_id using ${patstat_out}/docdb_families_famsize.dta, unmatched(master)
rename famsize famsize_offices
mmerge docdb_family_id using ${patstat_out}/docdb_families_famsize_nb_pat.dta, unmatched(master)
rename famsize famsize_patents

* hvi flags families with more than one patent
generate hvi = famsize_patents > 1
drop _merge
compress
save ${patstat_out}/docdb_families.dta, replace

**************************************************************************************************************************
****			CITATIONS
**************************************************************************************************************************

* Table 212 (citations): convert the 8 raw parts to .dta ...
forvalues p = 1/8 {
    clear
    insheet using ${patstat_in}/tls212_part0`p'.txt, name
    save ${patstat_tmp}/tls212_part0`p'.dta, replace
}

* ... and stack them into one intermediate file
local stub ${patstat_tmp}/tls212_part0
use `stub'1.dta, clear
forvalues p = 2/8 {
    append using `stub'`p'.dta, force
}
compress
save ${patstat_tmp}/tls212.dta, replace


* Table 228 (citations by docdb family id): convert the 2 raw parts to .dta
foreach p in 1 2 {
    clear
    insheet using ${patstat_in}/tls228_part0`p'.txt, name
    save ${patstat_tmp}/tls228_part0`p'.dta, replace
}

* Stack both parts into one intermediate file
use ${patstat_tmp}/tls228_part01.dta, clear
append using ${patstat_tmp}/tls228_part02.dta, force
compress
save ${patstat_tmp}/tls228.dta, replace


* Citations, cleaning
use ${patstat_tmp}/tls212.dta, clear

* A row with neither a cited publication nor a cited application is unusable
drop if cited_pat_publn_id == 0 & cited_appln_id == 0

keep pat_publn_id cited_pat_publn_id cited_appln_id citn_origin
rename pat_publn_id citing_pat_publn_id

* Citation origin: string -> labelled numeric
encode citn_origin, gen(cit_origin)
drop citn_origin

compress
save ${patstat_out}/citations.dta, replace

* Citations from application to application
use ${patstat_out}/citations.dta, clear

* Flag citations whose encoded origin equals 2.
* NOTE(review): presumably code 2 is the applicant origin; the numeric code
* depends on the alphabetical order of the citn_origin strings - confirm.
generate byte citn_applicant0 = (cit_origin == 2)
drop cit_origin

* Cited side: where no cited application id is given (0), take the application
* of the cited publication
mmerge cited_pat_publn_id using ${patstat_out}/publn_info.dta, unmatched(master) umatch(pat_publn_id) ukeep(appln_id)
replace cited_appln_id = appln_id if cited_appln_id == 0
drop cited_pat_publn_id _merge appln_id
duplicates drop

* Citing side: application of the citing publication
mmerge citing_pat_publn_id using ${patstat_out}/publn_info.dta, unmatched(master) umatch(pat_publn_id) ukeep(appln_id)
drop citing_pat_publn_id _merge
duplicates drop
rename appln_id citing_appln_id

* A cited/citing pair is flagged if any of its citations is flagged
bysort cited_appln_id citing_appln_id : egen citn_applicant = max(citn_applicant0)
drop citn_applicant0
duplicates drop

* Add the application year of the citing and the earliest publication year of
* the cited application
mmerge citing_appln_id using ${patstat_out}/appln_info.dta, umatch(appln_id) unmatched(master) ukeep(appln_year)
rename appln_year citing_appln_year
mmerge cited_appln_id using ${patstat_out}/appln_info.dta, umatch(appln_id) unmatched(master) ukeep(earliest_publn_year)
rename earliest_publn_year cited_earliest_publn_year
duplicates drop
drop _merge
compress
save ${patstat_out}/citations_by_appln_id.dta, replace


* Identify self-cites: cited/citing application pairs that share at least one
* inventor (same person_id on both sides).
* Done in two rounds because the data are large: round 1 handles cited ids up
* to the median of cited_appln_id, round 2 handles the rest and appends the
* result of round 1.
forvalues round = 1/2 {
    use ${patstat_out}/citations_by_appln_id.dta, clear

    if `round' == 1 {
        * The split point is computed once and reused in round 2
        sort cited_appln_id
        _pctile cited_appln_id, percentiles(50)
        local N `r(r1)'
        di "Splitting on cited_appln_id = `N'"
        keep if cited_appln_id <= `N'
    }
    else {
        keep if cited_appln_id > `N'
    }

    * Attach every inventor of the cited and of the citing application
    mmerge cited_appln_id using ${patstat_out}/appln_inventors.dta, umatch(appln_id) unmatched(none)
    rename person_id person_id_cited
    drop _merge
    mmerge citing_appln_id using ${patstat_out}/appln_inventors.dta, umatch(appln_id) unmatched(none)
    rename person_id person_id_citing
    drop _merge

    * Keep the pairs with at least one common inventor
    generate x = person_id_cited == person_id_citing
    bysort cited_appln_id citing_appln_id : egen self_cite = sum(x)
    keep if self_cite > 0
    drop self_cite x person_id_cited person_id_citing
    duplicates drop
    compress

    * Round 2 adds the pairs found in round 1 before overwriting the file
    if `round' == 2 {
        append using ${patstat_tmp}/self_cite.dta
    }
    save ${patstat_tmp}/self_cite.dta, replace
}


* Citation counts per cited application, excluding self-citations
use ${patstat_out}/citations_by_appln_id.dta, clear
mmerge cited_appln_id citing_appln_id using ${patstat_tmp}/self_cite.dta, unmatched(master)
* Matched pairs are self-cites: remove them
keep if _merge != 3
drop _merge

* x counts every citation; fiveyr counts those whose citing application year
* is less than 6 years after the earliest publication year of the cited
* application
generate x = 1
generate fiveyr = (citing_appln_year - cited_earliest_publn_year) < 6

* Totals per cited application; the _app variants count only citations with
* citn_applicant equal to 1
bysort cited_appln_id : egen cit_alltime = sum(x)
bysort cited_appln_id : egen cit_5yrs = sum(fiveyr)
bysort cited_appln_id : egen cit_alltime_app = sum(x*citn_applicant)
bysort cited_appln_id : egen cit_5yrs_app = sum(fiveyr*citn_applicant)
keep cited_appln_id cit_*
duplicates drop
compress
save ${patstat_tmp}/forward_citations_exclself.dta, replace

* Some corrections and cleaning: bring the counts to the full list of
* applications and set the count to zero for applications never cited.
* The final file is written to the output directory.
use ${patstat_out}/appln_info.dta, clear
mmerge appln_id using ${patstat_tmp}/forward_citations_exclself.dta, unmatched(master) umatch(cited_appln_id)
drop _merge
foreach count in cit_alltime cit_5yrs cit_alltime_app cit_5yrs_app {
    replace `count' = 0 if `count' == .
}
keep appln_id cit_*
drop if appln_id == 0
compress
save ${patstat_out}/forward_citations_exclself.dta, replace


* Citation counts per cited application, including self-citations: same
* computation as the excluding-self-citations version but without removing
* the self-cite pairs
use ${patstat_out}/citations_by_appln_id.dta, clear
generate x = 1
generate fiveyr = (citing_appln_year - cited_earliest_publn_year < 6)

bysort cited_appln_id : egen cit_alltime = sum(x)
bysort cited_appln_id : egen cit_5yrs = sum(fiveyr)
bysort cited_appln_id : egen cit_alltime_app = sum(x*citn_applicant)
bysort cited_appln_id : egen cit_5yrs_app = sum(fiveyr*citn_applicant)
keep cited_appln_id cit_*
duplicates drop
compress
save ${patstat_tmp}/forward_citations_inclself.dta, replace

* Bring the counts to the full list of applications; never-cited
* applications get zero
use ${patstat_out}/appln_info.dta, clear
mmerge appln_id using ${patstat_tmp}/forward_citations_inclself.dta, unmatched(master) umatch(cited_appln_id)
drop _merge
foreach count in cit_alltime cit_5yrs cit_alltime_app cit_5yrs_app {
    replace `count' = 0 if `count' == .
}
keep appln_id cit_*
drop if appln_id == 0
compress
save ${patstat_out}/forward_citations_inclself.dta, replace


* Average citations, excluding self-citations, by publication authority and
* publication year.
* Input is the excluding-self-citations count file written earlier in this
* script. The file name forward_citations.dta used here before is never
* created by this script, so the section could not run from a clean build.
use ${patstat_out}/forward_citations_exclself.dta, clear
mmerge appln_id using ${patstat_out}/publn_info.dta, unmatched(master) ukeep(publn_auth publn_year)
* Use the earliest publication year of each application
bysort appln_id : egen min_publn_year = min(publn_year)
replace publn_year = min_publn_year
* One row per application x authority before averaging
keep publn_year appln_id publn_auth cit_5yrs cit_alltime
duplicates drop
bys publn_auth publn_year : egen avg_cit_alltime = mean(cit_alltime)
bys publn_auth publn_year : egen avg_cit_5yrs = mean(cit_5yrs)
keep publn_auth publn_year avg*
duplicates drop
compress
save ${patstat_out}/average_citations.dta, replace


* Citations family to family, excluding self-citations
use ${patstat_out}/citations_by_appln_id.dta, clear
mmerge cited_appln_id citing_appln_id using ${patstat_tmp}/self_cite, unmatched(master)
* Matched pairs are self-cites: remove them
keep if _merge != 3

* Match both sides of each citation to its docdb family
mmerge cited_appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id) umatch(appln_id)
rename docdb_family_id cited_docdb_family_id
mmerge citing_appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id) umatch(appln_id)
rename docdb_family_id citing_docdb_family_id

* Publication year of the cited family: maximum over its valid years (9999 and
* missing are ignored). Rows with an invalid year get no value from egen; they
* sort last within the family and are filled from the row above.
bysort cited_docdb_family_id : egen cited_fam_publn_year = max(cited_earliest_publn_year) if !inlist(cited_earliest_publn_year, 9999, .)
gsort cited_docdb_family_id cited_fam_publn_year
bysort cited_docdb_family_id : replace cited_fam_publn_year = cited_fam_publn_year[_n-1] if cited_fam_publn_year == .

* Application year of the citing family: earliest over its applications
bysort citing_docdb_family_id : egen citing_fam_appln_year = min(citing_appln_year)

* NOTE(review): the original comment at this point read "check no missing
* data", but no check is performed here.
keep cited_docdb_family_id citing_docdb_family_id cited_fam_publn_year citing_fam_appln_year
duplicates drop
compress

* Count citing families per cited family: all time, and with a citing
* application year less than 6 / less than 4 years after the cited
* publication year
generate x = 1
generate fiveyr = (citing_fam_appln_year - cited_fam_publn_year) < 6
generate threeyr = (citing_fam_appln_year - cited_fam_publn_year) < 4
bysort cited_docdb_family_id : egen cit_alltime = sum(x)
bysort cited_docdb_family_id : egen cit_5yrs = sum(fiveyr)
bysort cited_docdb_family_id : egen cit_3yrs = sum(threeyr)
keep cited_docdb_family_id cit_*
duplicates drop
compress
save ${patstat_tmp}/citations_by_docdb_id_exclself.dta, replace

* Merge with the full list of families; never-cited families get zero
use ${patstat_out}/family_info.dta, clear
keep docdb_family_id
duplicates drop
mmerge docdb_family_id using ${patstat_tmp}/citations_by_docdb_id_exclself.dta, unmatched(master) umatch(cited_docdb_family_id)
foreach count in cit_alltime cit_5yrs cit_3yrs {
    replace `count' = 0 if `count' == .
}
drop _merge
compress
save ${patstat_out}/citations_by_docdb_id_exclself.dta, replace


* Citations family to family, including self-citations
use ${patstat_out}/citations_by_appln_id.dta, clear

* Match both sides of each citation to its docdb family
mmerge cited_appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id) umatch(appln_id)
rename docdb_family_id cited_docdb_family_id
mmerge citing_appln_id using ${patstat_out}/family_info.dta, unmatched(master) ukeep(docdb_family_id) umatch(appln_id)
rename docdb_family_id citing_docdb_family_id

* Publication year of the cited family: maximum over its valid years
* (9999 and missing are ignored)
bysort cited_docdb_family_id : egen cited_fam_publn_year = max(cited_earliest_publn_year) if !inlist(cited_earliest_publn_year, 9999, .)

* Impute missing values: rows without a value sort last within the family and
* are filled from the row above
gsort cited_docdb_family_id cited_fam_publn_year
bysort cited_docdb_family_id : replace cited_fam_publn_year = cited_fam_publn_year[_n-1] if cited_fam_publn_year == .

* Application year of the citing family: earliest over its applications
bysort citing_docdb_family_id : egen citing_fam_appln_year = min(citing_appln_year)

* NOTE(review): the original comment at this point read "check no missing
* data", but no check is performed here.
keep cited_docdb_family_id citing_docdb_family_id cited_fam_publn_year citing_fam_appln_year
duplicates drop

* Count citing families per cited family: all time, and with a citing
* application year less than 6 years after the cited publication year
generate x = 1
generate fiveyr = (citing_fam_appln_year - cited_fam_publn_year < 6)
bysort cited_docdb_family_id : egen cit_alltime = sum(x)
bysort cited_docdb_family_id : egen cit_5yrs = sum(fiveyr)
keep cited_docdb_family_id cit_*
duplicates drop
compress
save ${patstat_tmp}/citations_by_docdb_id_inclself.dta, replace

* Merge with the full list of families; never-cited families get zero
use ${patstat_out}/family_info.dta, clear
keep docdb_family_id
duplicates drop
mmerge docdb_family_id using ${patstat_tmp}/citations_by_docdb_id_inclself.dta, unmatched(master) umatch(cited_docdb_family_id)
foreach count in cit_alltime cit_5yrs {
    replace `count' = 0 if `count' == .
}
drop _merge
compress
save ${patstat_out}/citations_by_docdb_id_inclself.dta, replace

}
* _rc still holds the return code of the capture block that closes just
* above, so it must be read before any other command runs.
local outcome = cond(_rc == 0, "successfully", "with errors")
display "Execution finished `outcome'."

* Close the named log; captured because the log may never have been opened
cap log close dat